# install.packages("tidyverse");
# install.packages("rgdal");
library(tidyverse)
require("maps")
library(geosphere)
library(stringr)
library(rgdal)
library(caret)
library(lubridate)
if (!require(ggmap)) { install.packages('ggmap'); require(ggmap) }
library(ggmap)
path.to.csv <- '../Milestone 2/Seattle_Police_Department_911_Incident_Response_Oct17.csv'
spd.911 <- read.csv(path.to.csv, TRUE)
spd.911$clearance_date_ts = as.POSIXct(strptime(spd.911$Event.Clearance.Date, "%m/%d/%Y %I:%M:%S %p"))
spd.911$clearance_date_date = as.Date(spd.911$clearance_date_ts)
View(spd.911)
# path to the FOLDER with the .shp file in it. the second param is the name of the .shp file
# seattle <- readOGR(dsn = path.expand("~/documents/INFO370/project-teamname-v2/maps-api-test"), layer = "Seattle_City_Limits")
# usa <- map_data("state")
# data <- merge(usa, spd.911)
# Red Square coordinates
here_long <-  -122.3095
here_lat <- 47.6560
seattle = get_map(location = c(here_long, here_lat), zoom = 13, maptype = 'roadmap')
Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=47.656,-122.3095&zoom=13&size=640x640&scale=2&maptype=roadmap&language=en-EN&sensor=false

freq_by_desc <- table(droplevels(data$Event.Clearance.Description))
# View(freq_by_desc)
ggplot(as.data.frame(freq_by_desc), 
       aes(x = Var1, y = Freq)) +
       geom_bar(stat = 'identity') +# create bar plot
    coord_flip()

#Traffic related calls, suspicious circumstances, and disturbances are the the most significant threats to pedestrations
        
ggmap(seattle) +
  geom_point(data = data, aes(x = Longitude, y = Latitude, group = Event.Clearance.Description, color = Event.Clearance.Description), alpha = 0.5) +
  facet_wrap(~ Event.Clearance.Description) +
  theme(axis.ticks = element_blank(), 
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        legend.position = "none"
        )

# selecting just ID and location data
df_loc <- data %>% dplyr::select(CAD.CDW.ID, Longitude, Latitude)
# figuring out number of clusters
wss <- c()
# clusters 1 to 15
for (i in 1:15) {
  wss[i] <- sum(kmeans(df_loc, centers=i)$withinss)
}
plot(1:15, wss, type="b", xlab="Number of Clusters",
  ylab="Within groups sum of squares")

# fitting model
fit <- kmeans(df_loc, 10)
fit$centers # look at cluster sizes and means. want clusters to be about equal size
   CAD.CDW.ID Longitude Latitude
1     2110534 -122.3090 47.65473
2     2108019 -122.3145 47.65467
3     2114454 -122.3135 47.66288
4     2115894 -122.3117 47.66063
5     2117445 -122.3115 47.66015
6     2105779 -122.3122 47.65821
7     2119756 -122.3128 47.65627
8     2122787 -122.3178 47.66599
9     2112589 -122.3124 47.66454
10    2124681 -122.3089 47.65948
fit$cluster
 [1]  6  6  6  6  6  6  6  6  6  2  2  2  2  2  2  1  1  1  1  1  1  1  9  9  9  9  9  9  9  9  9  9  3  3
[35]  3  3  3  3  3  3  3  3  4  4  4  4  4  4  4  4  4  4  4  4  5  5  5  5  5  5  7  7  7  7  7  7  7  7
[69]  7  7  7  7  7  8  8  8  8  8  8  8  8  8  8 10 10 10 10 10 10 10 10 10
cluster.size <- data.frame(1:10, fit$size)
cluster.size
ggplot(data = cluster.size, aes(x = X1.10, y = fit.size)) +
  geom_bar(stat = 'identity')

ggplot()

ggmap(seattle) +
  geom_point(data = as.data.frame(fit$centers), aes(x = Longitude, y = Latitude), alpha = 0.5)

# looking at cluster means
aggregate(df_loc, by=list(fit$cluster), FUN=mean)
df_loc
# adding data back into dataframe 
# df_loc <- df_loc %>% mutate(cluster = fit$cluster) 
data$cluster <- fit$cluster
# View(data)
# timestamp ->  year  month day hour  minute
# sector -> to factor (there are 17 sectors)
# beat -> to factor (there are 3 beats per sector)
# clean the data a bit more
data$event_clearance_ts = as.POSIXct(strptime(data$Event.Clearance.Date, "%m/%d/%Y %I:%M:%S %p"))
data$event_clearance_date = as.Date(data$event_clearance_ts)
data$event_clearance_month = month(ymd_hms(as.character(data$event_clearance_ts)))
data$event_clearance_day = weekdays(data$event_clearance_date)
data$event_clearance_hr = hour(ymd_hms(as.character(data$event_clearance_ts)))
data$event_clearance_mn = minute(ymd_hms(as.character(data$event_clearance_ts)))
data$Initial.Type.Group = factor(data$Initial.Type.Group)
data$Event.Clearance.Group = factor(data$Event.Clearance.Group)
data$Zone.Beat = factor(data$Zone.Beat)
data$District.Sector = factor(data$District.Sector)
data$event_clearance_day = factor(data$event_clearance_day)
data
col.names <- paste(c(
  "Event.Clearance.Code"
  , "cluster"
  , "Census.Tract"
  , "event_clearance_day"
  , "Event.Clearance.Group"
  , "Event.Clearance.SubGroup"
  , "District.Sector"
  , "Zone.Beat"
  #, "event_clearance_ts"
  # ,"Incident.Location"
  , "event_clearance_hr"
  , "event_clearance_mn"
  , "event_clearance_month" 
  , "Hundred.Block.Location"
  ), collapse="|")
cols <- grep(col.names, colnames(data))
cols
 [1]  4  6  7  9 10 11 12 23 26 27 28 29
# corr_matrix <- cor(data[,cols]) # correlations between all predictor vars
# corr_matrix
# cutoff <- 0.5 # should be higher in practice
# highly_corr <- findCorrelation(corr_matrix, cutoff=cutoff)
# print(colnames(spd.911)[highly_corr]) # age is highly correalted with pregnant
train.data <- select(data, cols)
train.data
# data <- data %>% droplevels()
# grep("Hundred.Block.Location", colnames(train.data), invert = T)
predictors <- grep("Hundred.Block.Location", colnames(train.data), invert = T)
outcome <- grep("Hundred.Block.Location", colnames(train.data))
# train.data[,predictors]
frame <- data.frame(train.data[,predictors])
frame
out.factor <- train.data$Hundred.Block.Location
as.vector(out.factor)
 [1] "43XX BLOCK OF 15 AV NE"                 "24XX BLOCK OF E LOUISA ST"             
 [3] "48XX BLOCK OF SAND POINT WY NE"         "42XX BLOCK OF UNIVERSITY WY NE"        
 [5] "21XX BLOCK OF N NORTHLAKE WY"           "14XX BLOCK OF NE 43 ST"                
 [7] "14XX BLOCK OF NE 43 ST"                 "17XX BLOCK OF N 45 ST"                 
 [9] "NE 54 ST / 21 AV NE"                    "29XX BLOCK OF FAIRVIEW AV E"           
[11] "SAND POINT WY NE / 40 AV NE"            "55XX BLOCK OF 12 AV NE"                
[13] "NE 43 ST / UNIVERSITY WY NE"            "E ROANOKE ST / I5 NB"                  
[15] "E HAMLIN ST / EASTLAKE AV E"            "23XX BLOCK OF 24 AV E"                 
[17] "50XX BLOCK OF UNIVERSITY WY NE"         "E MONTLAKE PL E / E LAKE WASHINGTON BV"
[19] "14XX BLOCK OF NE 43 ST"                 "16XX BLOCK OF INTERLAKEN PL E"         
[21] "8XX BLOCK OF NE 66 ST"                  "NE 47 ST / 17 AV NE"                   
[23] "45XX BLOCK OF 16 AV NE"                 "16XX BLOCK OF NE 50 ST"                
[25] "45XX BLOCK OF UNIVERSITY WY NE"         "47XX BLOCK OF UNIVERSITY WY NE"        
[27] "NE 52 ST / ROOSEVELT WY NE"             "NE 50 ST / 9 AV NE"                    
[29] "45XX BLOCK OF 25 AV NE"                 "61XX BLOCK OF BROOKLYN AV NE"          
[31] "47XX BLOCK OF UNIVERSITY WY NE"         "45XX BLOCK OF UNIVERSITY WY NE"        
[33] "15 AV NE / NE 55 ST"                    "NE 45 ST / 11 AV NE"                   
[35] "46XX BLOCK OF 27 AV NE"                 "2 AV NE / NE 50 ST"                    
[37] "11 AV NE / NE 42 ST"                    "50XX BLOCK OF 7 AV NE"                 
[39] "NE BLAKELEY ST / 25 AV NE"              "4213 1 / 2 UNIVERSITY WY NE"           
[41] "15 AV NE / NE 42 ST"                    "NE 47 ST / 9 AV NE"                    
[43] "NE 45 ST / 5 AV NE"                     "NE 52 ST / 15 AV NE"                   
[45] "47XX BLOCK OF 30 AV NE"                 "UNIVERSITY WY NE / NE 56 ST"           
[47] "ROOSEVELT WY NE / NE 45 ST"             "50XX BLOCK OF UNIVERSITY WY NE"        
[49] "41XX BLOCK OF UNIVERSITY WY NE"         "45XX BLOCK OF 15 AV NE"                
[51] "27XX BLOCK OF MONTLAKE BV E"            "14XX BLOCK OF N 42 ST"                 
[53] "38XX BLOCK OF 42 AV NE"                 "42XX BLOCK OF UNIVERSITY WY NE"        
[55] "48XX BLOCK OF SAND POINT WY NE"         "24XX BLOCK OF E LOUISA ST"             
[57] "37XX BLOCK OF CORLISS AV N"             "45XX BLOCK OF 18 AV NE"                
[59] "ROOSEVELT WY NE / NE 63 ST"             "NE 50 ST / 2 AV NE"                    
[61] "45XX BLOCK OF 25 AV NE"                 "48XX BLOCK OF SAND POINT WY NE"        
[63] "45XX BLOCK OF 9 AV NE"                  "14XX BLOCK OF N 45 ST"                 
[65] "15 AV NE / NE 50 ST"                    "23XX BLOCK OF MINOR AV E"              
[67] "24 AV E / E MONTLAKE PL E"              "50XX BLOCK OF ROOSEVELT WY NE"         
[69] "8XX BLOCK OF NE 42 ST"                  "40XX BLOCK OF UNIVERSITY WY NE"        
[71] "38XX BLOCK OF NE 57 ST"                 "9XX BLOCK OF E ROANOKE ST"             
[73] "23XX BLOCK OF EASTLAKE AV E"            "BURKE AV N / N NORTHLAKE WY"           
[75] "22XX BLOCK OF NE 51 ST"                 "52XX BLOCK OF 22 AV NE"                
[77] "N 45 ST / BURKE AV N"                   "LATONA AV NE / NE 58 ST"               
[79] "1 AV NE / N 56 ST"                      "ROOSEVELT WY NE / NE 65 ST"            
[81] "43XX BLOCK OF UNIVERSITY WY NE"         "65XX BLOCK OF 18 AV NE"                
[83] "50XX BLOCK OF 19 AV NE"                 "EASTLAKE AV E / E BOSTON ST"           
[85] "45XX BLOCK OF 12 AV NE"                 "15 AV NE / NE 50 ST"                   
[87] "14XX BLOCK OF NE 43 ST"                 "22 AV E / E MILLER ST"                 
[89] "50XX BLOCK OF RAVENNA AV NE"            "52XX BLOCK OF 15 AV NE"                
[91] "55XX BLOCK OF 17 AV NE"                 "39 AV NE / NE 55 ST"                   
control <- rfeControl(functions = rfFuncs, method="cv", number=10)
results <- rfe(frame, out.factor, sizes = c(1:13), rfeControl = control) # this will take AWHILE...
Error in { : task 1 failed - "Can't have empty classes in y."
---
title: "R Notebook"
output: html_notebook
---

```{r setup}
# install.packages("tidyverse");
# install.packages("rgdal");
library(tidyverse)
require("maps")
library(geosphere)
library(stringr)
library(rgdal)
library(caret)
library(lubridate)
if (!require(ggmap)) { install.packages('ggmap'); require(ggmap) }
library(ggmap)
path.to.csv <- '../Milestone 2/Seattle_Police_Department_911_Incident_Response_Oct17.csv'
spd.911 <- read.csv(path.to.csv, TRUE)

spd.911$clearance_date_ts = as.POSIXct(strptime(spd.911$Event.Clearance.Date, "%m/%d/%Y %I:%M:%S %p"))
spd.911$clearance_date_date = as.Date(spd.911$clearance_date_ts)
View(spd.911)



# path to the FOLDER with the .shp file in it. the second param is the name of the .shp file
# seattle <- readOGR(dsn = path.expand("~/documents/INFO370/project-teamname-v2/maps-api-test"), layer = "Seattle_City_Limits")

# usa <- map_data("state")
# data <- merge(usa, spd.911)
# Red Square coordinates
here_long <-  -122.3095
here_lat <- 47.6560

seattle = get_map(location = c(here_long, here_lat), zoom = 13, maptype = 'roadmap')

```


```{r}


spd.911 <- spd.911 %>% 
             rowwise() %>% 
             mutate(dist=distVincentyEllipsoid(c(Longitude, Latitude), c(here_long, here_lat)))              
nrow(spd.911)

descriptions <- c("STRONG ARM ROBBERY", "PERSON WITH A WEAPON (NOT GUN)", "HAZARDS", "HARASSMENT, THREATS", "FIGHT DISTURBANCE", "CRISIS COMPLAINT - GENERAL", "ARMED ROBBERY")

# Removes Specifically Harassment by Telephone and Writing, as well as other non-scary crimes
data.ped <- spd.911 %>% filter(str_detect(Event.Clearance.Description, paste(descriptions, collapse="|"))) %>% filter(!str_detect(Event.Clearance.Description, "HARASSMENT, THREATS - BY TELEPHONE, WRITING"))
# data.ped <- data.now
nrow(data.ped)

# data.now <- data.ped %>% filter(clearance_date_ts < '2017-10-31 00:00:00')
nrow(data.now)
                  
data.here <- data.now %>% filter(dist < 2600)

data <- read.csv('YearSEAPD.csv', header = TRUE)
nrow(data)
# View(data)

ggmap(seattle) +
   geom_point(data = data, aes(x = Longitude, y = Latitude), colour = "red", alpha = 0.75)
  #coord_map()

```

```{r}
freq_by_desc <- table(droplevels(data$Event.Clearance.Description))
# View(freq_by_desc)

ggplot(as.data.frame(freq_by_desc), 
       aes(x = Var1, y = Freq)) +
       geom_bar(stat = 'identity') +# create bar plot
    coord_flip()

#Traffic related calls, suspicious circumstances, and disturbances are the the most significant threats to pedestrations

        
```

```{r}
ggmap(seattle) +
  geom_point(data = data, aes(x = Longitude, y = Latitude, group = Event.Clearance.Description, color = Event.Clearance.Description), alpha = 0.5) +
  facet_wrap(~ Event.Clearance.Description) +
  theme(axis.ticks = element_blank(), 
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        legend.position = "none"
        )
```

```{r}
# selecting just ID and location data
df_loc <- data %>% dplyr::select(CAD.CDW.ID, Longitude, Latitude)

# figuring out number of clusters
wss <- c()
# clusters 1 to 15
for (i in 1:15) {
  wss[i] <- sum(kmeans(df_loc, centers=i)$withinss)
}
plot(1:15, wss, type="b", xlab="Number of Clusters",
  ylab="Within groups sum of squares")

# fitting model
fit <- kmeans(df_loc, 10)
fit$centers # look at cluster sizes and means. want clusters to be about equal size
fit$cluster
cluster.size <- data.frame(1:10, fit$size)
cluster.size

ggplot(data = cluster.size, aes(x = X1.10, y = fit.size)) +
  geom_bar(stat = 'identity')
ggplot()
ggmap(seattle) +
  geom_point(data = as.data.frame(fit$centers), aes(x = Longitude, y = Latitude), alpha = 0.5)
# looking at cluster means
aggregate(df_loc, by=list(fit$cluster), FUN=mean)

df_loc

# adding data back into dataframe 
# df_loc <- df_loc %>% mutate(cluster = fit$cluster) 
data$cluster <- fit$cluster

# View(data)
```

```{r}
# timestamp ->  year  month day hour  minute
# sector -> to factor (there are 17 sectors)
# beat -> to factor (there are 3 beats per sector)

# clean the data a bit more
data$event_clearance_ts = as.POSIXct(strptime(data$Event.Clearance.Date, "%m/%d/%Y %I:%M:%S %p"))
data$event_clearance_date = as.Date(data$event_clearance_ts)
data$event_clearance_month = month(ymd_hms(as.character(data$event_clearance_ts)))
data$event_clearance_day = weekdays(data$event_clearance_date)
data$event_clearance_hr = hour(ymd_hms(as.character(data$event_clearance_ts)))
data$event_clearance_mn = minute(ymd_hms(as.character(data$event_clearance_ts)))
data$Initial.Type.Group = factor(data$Initial.Type.Group)
data$Event.Clearance.Group = factor(data$Event.Clearance.Group)
data$Zone.Beat = factor(data$Zone.Beat)
data$District.Sector = factor(data$District.Sector)
data$event_clearance_day = factor(data$event_clearance_day)
data

col.names <- paste(c(
  "Event.Clearance.Code"
  , "cluster"
  , "Census.Tract"
  , "event_clearance_day"
  , "Event.Clearance.Group"
  , "Event.Clearance.SubGroup"
  , "District.Sector"
  , "Zone.Beat"
  #, "event_clearance_ts"
  # ,"Incident.Location"
  , "event_clearance_hr"
  , "event_clearance_mn"
  , "event_clearance_month" 
  , "Hundred.Block.Location"
  ), collapse="|")
cols <- grep(col.names, colnames(data))
cols
# corr_matrix <- cor(data[,cols]) # correlations between all predictor vars
# corr_matrix

# cutoff <- 0.5 # should be higher in practice

# highly_corr <- findCorrelation(corr_matrix, cutoff=cutoff)
# print(colnames(spd.911)[highly_corr]) # age is highly correalted with pregnant

train.data <- select(data, cols)
train.data
# data <- data %>% droplevels()

# grep("Hundred.Block.Location", colnames(train.data), invert = T)

predictors <- grep("Hundred.Block.Location", colnames(train.data), invert = T)
outcome <- grep("Hundred.Block.Location", colnames(train.data))

# train.data[,predictors]
frame <- data.frame(train.data[,predictors])
frame
out.factor <- train.data$Hundred.Block.Location
as.vector(out.factor)


control <- rfeControl(functions = rfFuncs, method="cv", number=10)
results <- rfe(frame, out.factor, sizes = c(1:13), rfeControl = control) # this will take AWHILE...

results
ggplot(results)

# chosen features
predictors(results)
```
